Table Pivots

DSST 289: Introduction to Data Science

Erik Fredner

2024-10-02

Exam 1 grade distribution

Real-world pivoting!

library(tidyverse)
library(ggrepel)

grades <- read_csv("../GITIGNORED/exam1.csv")

grades |> slice_sample(n = 2)
# A tibble: 2 × 3
     id open_notes closed_notes
  <dbl>      <dbl>        <dbl>
1    28         49         50  
2    17         44         45.5

pivot_longer

grades <- grades |>
  pivot_longer(
    cols = !id,
    names_to = "exam",
    values_to = "score"
  )

grades |> slice_sample(n = 2)
# A tibble: 2 × 3
     id exam       score
  <dbl> <chr>      <dbl>
1     4 open_notes    47
2    25 open_notes    48

Visualizing exam grades with geom_density

Exam review

Review topics

  • Formatting
  • Inheriting from ggplot
  • Fixed aesthetics
  • select |> distinct
  • Layering geom_point
  • filter %in% vs. filter ==
  • group_by |> summarize
  • count
  • fct_

cats data

cats <- read_csv("../data/domestic_cats.csv")

cats |> slice_sample(n = 3)
# A tibble: 3 × 10
  name    event_id month  long   lat indoors_min indoors_max prey_month   age
  <chr>      <dbl> <dbl> <dbl> <dbl>       <dbl>       <dbl>      <dbl> <dbl>
1 Jinx  1165781419     2  139. -34.9           6          11         10     6
2 Tobin 1121858172    10  139. -35.0          12          17          0     6
3 Kitty  737747770     6  139. -34.9           0           5          1     4
# ℹ 1 more variable: sex <chr>

pokemon data

pokemon <- read_csv("../data/pokemon.csv")

pokemon |> slice_sample(n = 3)
# A tibble: 3 × 13
  pokedex_no name  form  type_1 type_2 stat_total    hp attack defense sp_attack
       <dbl> <chr> <chr> <chr>  <chr>       <dbl> <dbl>  <dbl>   <dbl>     <dbl>
1        789 Cosm… <NA>  Psych… <NA>          200    43     29      31        29
2        362 Glal… <NA>  Ice    <NA>          480    80     80      80        80
3        731 Piki… <NA>  Normal Flying        265    35     75      30        30
# ℹ 3 more variables: sp_defense <dbl>, speed <dbl>, generation <dbl>

Inheriting from ggplot

cats |>
  filter(sex == "m", between(age, 9, 10)) |>
  ggplot(aes(x = long, y = lat)) +
  # `geom_point` inherits `aes` from `ggplot`
  geom_point(alpha = 0.5)

Fixed aesthetics

cats |>
  filter(sex == "m", between(age, 9, 10)) |>
  ggplot(aes(x = long, y = lat)) +
  # NOT inside `aes()`
  geom_point(alpha = 0.5)

select |> distinct

cat_traits <- cats |>
  select(name, indoors_min, prey_month, age, sex) |>
  distinct()

cat_traits |> slice_sample(n = 3)
# A tibble: 3 × 5
  name     indoors_min prey_month   age sex  
  <chr>          <dbl>      <dbl> <dbl> <chr>
1 Muss              12          1     7 m    
2 Lilliput          18          3     3 f    
3 Kitty              0          1     4 m    

Layering geom_point

indoor_cats <- cat_traits |> filter(indoors_min >= 18)

cat_traits |>
  ggplot(aes(x = age, y = prey_month)) +
  geom_point(color = "gray") +
  geom_point(data = indoor_cats, color = "navyblue") +
  geom_text_repel(
    data = indoor_cats,
    aes(label = name),
    color = "navyblue"
  )

Layering geom_point

filter %in% vs. filter ==

pokemon_sample <- pokemon |>
  filter(
    generation == 1,
    type_1 %in% c("Electric", "Rock", "Psychic")
  ) |>
  select(name, generation, type_1, type_2, stat_total, attack, speed)

pokemon_sample |> slice_sample(n = 3)
# A tibble: 3 × 7
  name     generation type_1   type_2 stat_total attack speed
  <chr>         <dbl> <chr>    <chr>       <dbl>  <dbl> <dbl>
1 Voltorb           1 Electric <NA>          330     30   100
2 Kabutops          1 Rock     Water         495    115    80
3 Geodude           1 Rock     Ground        300     80    20

group_by |> summarize

pokemon_sample |>
  group_by(type_1) |>
  # round() is just for printing; not required for exam
  summarize(
    avg_speed = round(mean(speed), 1),
    avg_attack = round(mean(attack), 1),
    avg_stat_total = round(mean(stat_total), 1)
  )
# A tibble: 3 × 4
  type_1   avg_speed avg_attack avg_stat_total
  <chr>        <dbl>      <dbl>          <dbl>
1 Electric     100         62             446.
2 Psychic       93         60.1           470.
3 Rock          58.3       82.2           421.

count

pokemon_sample |>
  count(type_1, type_2)
# A tibble: 8 × 3
  type_1   type_2     n
  <chr>    <chr>  <int>
1 Electric Flying     1
2 Electric Steel      2
3 Electric <NA>       6
4 Psychic  Fairy      1
5 Psychic  <NA>       7
6 Rock     Flying     1
7 Rock     Ground     4
8 Rock     Water      4
# essentially the same:
pokemon_sample |>
  group_by(type_1, type_2) |>
  summarize(n = n())
# A tibble: 8 × 3
# Groups:   type_1 [3]
  type_1   type_2     n
  <chr>    <chr>  <int>
1 Electric Flying     1
2 Electric Steel      2
3 Electric <NA>       6
4 Psychic  Fairy      1
5 Psychic  <NA>       7
6 Rock     Flying     1
7 Rock     Ground     4
8 Rock     Water      4

fct_inorder in east-west travel 1

cats |>
  group_by(name) |>
  summarize(distance = max(long) - min(long)) |>
  arrange(desc(distance)) |>
  ggplot(aes(x = distance, y = fct_inorder(name))) +
  geom_col()

fct_inorder in east-west travel 2

cats |>
  group_by(name) |>
  summarize(distance = max(long) - min(long)) |>
  arrange(desc(distance)) |>
  mutate(name = fct_inorder(name)) |> 
  ggplot(aes(x = distance, y = name)) +
  geom_col()

fct_reorder in east-west travel

cats |>
  group_by(name) |>
  summarize(distance = max(long) - min(long)) |>
  ggplot(aes(x = distance, y = fct_reorder(name, desc(distance)))) +
  geom_col()

Table Pivots

Data: billboard

Song rankings for the Billboard top 100 in the year 2000 from “The Whitburn Project.”

billboard |>
  slice_sample(n = 5) |> # select random rows
  select(1:6) # select the first few columns
# A tibble: 5 × 6
  artist           track                   date.entered   wk1   wk2   wk3
  <chr>            <chr>                   <date>       <dbl> <dbl> <dbl>
1 Houston, Whitney Same Script, Differe... 2000-06-17      71    71    71
2 Cam'ron          What Means The World... 2000-10-14      94    94    96
3 Papa Roach       Last Resort             2000-07-29      75    71    69
4 Jay-Z            Big Pimpin'             2000-04-22      69    52    39
5 Braxton, Toni    Just Be A Man About ... 2000-07-29      76    69    51

pivot_longer()

pivot_longer() lengthens datasets by increasing the number of rows and decreasing the number of columns.

Example

billboard |>  
  pivot_longer(
    cols = starts_with("wk"), 
    names_to = "week", 
    values_to = "rank",
    values_drop_na = TRUE
  ) |> 
  slice_sample(n = 5)
# A tibble: 5 × 5
  artist       track                   date.entered week   rank
  <chr>        <chr>                   <date>       <chr> <dbl>
1 Diffie, Joe  It's Always Somethin... 2000-08-12   wk6      57
2 Jagged Edge  He Can't Love U         1999-12-11   wk8      18
3 Ja Rule      Between Me And You      2000-09-16   wk17     30
4 Sonique      It Feels So Good        2000-01-22   wk9      14
5 Thomas, Carl Summer Rain             2000-09-23   wk2      82

Explanation

  • cols describes which columns need to be reshaped.
    • In this case, we only want to pivot the columns that start with "wk", hence starts_with("wk")
  • names_to gives the name of the column that will be created from the data stored in the column names, i.e. week
  • values_to gives the name of the column that will be created from the data stored in the cell value, i.e. rank

Cleaning

Let’s get the week as a numeric value:

billboard |> 
  pivot_longer(
    cols = starts_with("wk"), 
    names_to = "week", 
    # new:
    names_prefix = "wk",
    names_transform = as.integer,
    values_to = "rank",
    values_drop_na = TRUE,
  ) |> 
  slice_sample(n = 5)
# A tibble: 5 × 5
  artist         track                   date.entered  week  rank
  <chr>          <chr>                   <date>       <int> <dbl>
1 Westlife       Swear It Again          2000-04-01      19    45
2 Hot Boys       I Need A Hot Girl       2000-02-19       7    69
3 Carter, Aaron  Aaron's Party (Come ... 2000-08-26       5    35
4 Ja Rule        Between Me And You      2000-09-16      14    18
5 Chesney, Kenny I Lost It               2000-10-21       5    58

Finding number ones

no1s <- billboard |> 
  pivot_longer(
    cols = starts_with("wk"), 
    names_to = "week", 
    names_prefix = "wk",
    names_transform = as.integer,
    values_to = "rank",
    values_drop_na = TRUE,
  ) |> 
  filter(rank == 1) |> 
  select(artist, track) |> 
  distinct()

no1s
# A tibble: 17 × 2
   artist              track                  
   <chr>               <chr>                  
 1 Aaliyah             Try Again              
 2 Aguilera, Christina Come On Over Baby (A...
 3 Aguilera, Christina What A Girl Wants      
 4 Carey, Mariah       Thank God I Found Yo...
 5 Creed               With Arms Wide Open    
 6 Destiny's Child     Independent Women Pa...
 7 Destiny's Child     Say My Name            
 8 Iglesias, Enrique   Be With You            
 9 Janet               Doesn't Really Matte...
10 Lonestar            Amazed                 
11 Madonna             Music                  
12 N'Sync              It's Gonna Be Me       
13 Santana             Maria, Maria           
14 Savage Garden       I Knew I Loved You     
15 Sisqo               Incomplete             
16 Vertical Horizon    Everything You Want    
17 matchbox twenty     Bent                   

Filter join to get number ones

no1s <- billboard |> 
  semi_join(no1s, by = c("artist", "track"))

no1s <- no1s |> 
  pivot_longer(
    cols = starts_with("wk"), 
    names_to = "week", 
    names_prefix = "wk",
    names_transform = as.integer,
    values_to = "rank",
    values_drop_na = TRUE,
  )

no1s |> 
  slice_head(n = 5)
# A tibble: 5 × 5
  artist  track     date.entered  week  rank
  <chr>   <chr>     <date>       <int> <dbl>
1 Aaliyah Try Again 2000-03-18       1    59
2 Aaliyah Try Again 2000-03-18       2    53
3 Aaliyah Try Again 2000-03-18       3    38
4 Aaliyah Try Again 2000-03-18       4    28
5 Aaliyah Try Again 2000-03-18       5    21

Plot the hits

no1s |> 
  ggplot(aes(x = week, y = rank, color = track)) +
  geom_point() +
  geom_line() +
  labs(title = "Performance of number one hits in 2000") +
  scale_color_viridis_d() +
  scale_y_reverse() + # because 1 is best
  theme_classic()

Plot the hits

Summary

  • pivot_longer lengthens datasets by increasing the number of rows and decreasing the number of columns
  • pivot_wider widens datasets by increasing the number of columns and decreasing the number of rows
    • We will practice pivot_wider in the notebook
  • pivot_longer more commonly needed
    • because the tidyverse expects long data